# Print a per-column summary of ny_noaa (missing counts, ranges, quantiles,
# histograms), grouped by variable type.
skimr::skim(ny_noaa)
## Skim summary statistics
## n obs: 2595176
## n variables: 7
##
## ── Variable type:character ────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## id 0 2595176 2595176 11 11 0 747
## tmax 1134358 1460818 2595176 1 4 0 532
## tmin 1134420 1460756 2595176 1 4 0 548
##
## ── Variable type:Date ─────────────────────────────────────────────────────
## variable missing complete n min max median
## date 0 2595176 2595176 1981-01-01 2010-12-31 1997-01-21
## n_unique
## 10957
##
## ── Variable type:integer ──────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100
## prcp 145838 2449338 2595176 29.82 78.18 0 0 0 23 22860
## snow 381221 2213955 2595176 4.99 27.22 -13 0 0 0 10160
## snwd 591786 2003390 2595176 37.31 113.54 0 0 0 0 9195
## hist
## ▇▁▁▁▁▁▁▁
## ▇▁▁▁▁▁▁▁
## ▇▁▁▁▁▁▁▁
The ny_noaa data were accessed from the NOAA National Climatic Data Center and consist of 2595176 rows and 7 variables. Key variables include the observation date (date), precipitation (prcp), snowfall (snow), snow depth (snwd), maximum temperature (tmax), and minimum temperature (tmin). For tmax and tmin, a proportion of 0.4371025 (about 43.7%) of the data is missing, which is a big problem if we are trying to calculate temperature-related results, because we cannot be sure whether the missing data would lead to a different result.
Data cleaning
# Build the cleaned data set: split `date` into character year/month/day
# columns, coerce tmax/tmin to integer, divide tmax, tmin and prcp by 10
# (presumably the raw values are stored in tenths of a unit -- confirm against
# the NOAA documentation), then keep a random sample of 5000 rows.
ny_noaa_cleaned <- ny_noaa %>%
  separate(date, into = c("year", "month", "day"), sep = "-") %>%
  mutate(
    tmax = as.integer(tmax) / 10,
    tmin = as.integer(tmin) / 10,
    prcp = prcp / 10
  ) %>%
  sample_n(5000)
# Print the same per-column summary for the cleaned 5000-row sample.
skimr::skim(ny_noaa_cleaned)
## Skim summary statistics
## n obs: 5000
## n variables: 9
##
## ── Variable type:character ────────────────────────────────────────────────
## variable missing complete n min max empty n_unique
## day 0 5000 5000 2 2 0 31
## id 0 5000 5000 11 11 0 587
## month 0 5000 5000 2 2 0 12
## year 0 5000 5000 4 4 0 30
##
## ── Variable type:integer ──────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## snow 713 4287 5000 5.56 27.33 0 0 0 0 406 ▇▁▁▁▁▁▁▁
## snwd 1131 3869 5000 35.37 109.84 0 0 0 0 1067 ▇▁▁▁▁▁▁▁
##
## ── Variable type:numeric ──────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100
## prcp 260 4740 5000 3.08 8.57 0 0 0 2.3 274.3
## tmax 2218 2782 5000 13.99 11.3 -22.2 4.4 15 23.9 36.7
## tmin 2221 2779 5000 3.05 10.56 -37.2 -3.9 3.3 11.7 25
## hist
## ▇▁▁▁▁▁▁▁
## ▁▁▃▆▆▆▇▂
## ▁▁▂▃▇▇▇▂
NY average maximum temperature in January and July, by station and year
# Mean tmax, tmin and prcp for every station (id), year and month, restricted
# to January ("01") and July ("07").
tmax_jan_july <- ny_noaa_cleaned %>%
  select(id, year, month, tmax, tmin, prcp) %>%
  filter(month %in% c("01", "07")) %>%
  # Pin the levels explicitly: relying on the implicit level order breaks
  # (length mismatch between levels and labels) if the random sample happens
  # to contain only one of the two months.
  mutate(
    month = factor(month, levels = c("01", "07"), labels = c("Jan", "July"))
  ) %>%
  group_by(id, year, month) %>%
  # With na.rm = TRUE, a group whose values are all missing yields NaN.
  summarize(
    mean_tmax = mean(tmax, na.rm = TRUE),
    mean_tmin = mean(tmin, na.rm = TRUE),
    mean_prcp = mean(prcp, na.rm = TRUE)
  ) %>%
  # summarize() only peels off the last grouping variable; drop the remaining
  # id/year grouping so later verbs operate on the whole table.
  ungroup()
# Interactive scatter plot of mean_tmax against year, coloured by month; the
# hover text is assembled from the year and the mean maximum temperature.
tmax_jan_july %>%
  mutate(
    text_label = str_c("Year: ", year, " MaxTemp: ", mean_tmax, " C")
  ) %>%
  plot_ly(
    x = ~year,
    y = ~mean_tmax,
    color = ~month,
    text = ~text_label,
    type = "scatter",
    mode = "markers",
    alpha = 0.5
  )
## Warning: Ignoring 312 observations
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Box plots of prcp per year for 2000-2010, keeping only rows with prcp < 10
# (rows with missing prcp are dropped by the comparison).
ny_noaa_cleaned %>%
  # `year` is a character column, so match against character values rather
  # than chaining eleven `year == <number>` comparisons.
  filter(year %in% as.character(2000:2010), prcp < 10) %>%
  plot_ly(
    y = ~prcp,
    color = ~year,
    type = "box",
    colors = "Set2"
  )
# Bar chart of the number of sampled rows per station, with stations ordered
# by their row count.
ny_noaa_cleaned %>%
  count(id) %>%
  mutate(id = fct_reorder(id, n)) %>%
  plot_ly(
    type = "bar",
    x = ~id,
    y = ~n,
    color = ~id
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Bar chart of the number of rows per neighbourhood, with neighbourhoods
# ordered by their row count.
nyc_airbnb %>%
  count(neighbourhood) %>%
  mutate(neighbourhood = fct_reorder(neighbourhood, n)) %>%
  plot_ly(
    type = "bar",
    x = ~neighbourhood,
    y = ~n,
    color = ~neighbourhood
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors